# Load the banknote measurements and give the columns French names.
data <- read.csv(file = "notes.csv")
names(data) <- c('authentique', 'diagonale', 'hauteur_g', 'hauteur_d', 'marge_bas', 'marge_haut', 'longueur')
head(data, 1)
# Recode the class as 0/1 and keep a numeric copy without the original label column.
data_test <- data
data_test$auth <- ifelse(data_test$authentique == "True", 1, 0)
# Drop column 1 ('authentique'). Base indexing replaces the original
# `select(data_test, subset = -c(1))`: select() has no `subset` argument,
# so that name was spurious and misleading.
data_ok <- data_test[, -1]
rm(data_test)
# Mission 0 - Introduce the analysis with a brief description of the data
# (univariate analyses: summaries, density, histogram, boxplot; and bivariate analyses).
# Helper to inspect the data quickly: distribution of the qualitative variable
# and of the quantitative variables.
data_test = data
data_test$authentique = factor(data_test$authentique, levels = c('True', 'False'), labels = c('vrai_billet', 'faux_billet')) # Check the levels of the qualitative variable in the original data frame, and relabel them
# Quick EDA: structure, missing/zero status, class frequencies, numeric
# profiling and plots, and a detailed per-variable summary (helpers come
# from the loaded EDA packages).
basic_eda <- function(data_test)
{
glimpse(data_test)
print(status(data_test))
freq(data_test)
print(profiling_num(data_test))
plot_num(data_test)
describe(data_test)
}
basic_eda(data_test)
Rows: 170
Columns: 7
$ authentique <fct> vrai_billet, vrai_billet, vrai_billet, vrai_billet, vrai_billet, vrai_billet, vrai_billet, vrai_billet, vrai_billet, ~
$ diagonale <dbl> 171.81, 171.67, 171.83, 171.80, 172.05, 172.57, 172.38, 171.58, 171.96, 172.14, 172.27, 172.07, 172.19, 171.82, 172.0~
$ hauteur_g <dbl> 104.86, 103.74, 103.76, 103.78, 103.70, 104.65, 103.55, 103.65, 103.51, 104.34, 104.29, 103.64, 104.61, 103.78, 103.9~
$ hauteur_d <dbl> 104.95, 103.70, 103.76, 103.65, 103.75, 104.44, 103.80, 103.37, 103.75, 104.20, 104.22, 103.67, 103.69, 103.76, 103.7~
$ marge_bas <dbl> 4.52, 4.01, 4.40, 3.73, 5.04, 4.54, 3.97, 3.54, 4.06, 4.63, 3.89, 3.86, 4.00, 3.81, 3.81, 4.56, 4.07, 4.52, 3.60, 4.1~
$ marge_haut <dbl> 2.89, 2.87, 2.88, 3.12, 2.27, 2.99, 2.90, 3.19, 3.33, 3.02, 3.53, 3.20, 3.26, 3.25, 3.24, 2.56, 2.92, 2.71, 3.50, 3.0~
$ longueur <dbl> 112.83, 113.29, 113.84, 113.63, 113.55, 113.16, 113.30, 113.38, 113.53, 112.47, 113.50, 113.83, 112.91, 113.36, 113.4~
Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
data_test
7 Variables 170 Observations
-------------------------------------------------------------------------------------------------------------------------------------------
authentique
n missing distinct
170 0 2
Value vrai_billet faux_billet
Frequency 100 70
Proportion 0.588 0.412
-------------------------------------------------------------------------------------------------------------------------------------------
diagonale
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75 .90 .95
170 0 88 1 171.9 0.3422 171.5 171.6 171.7 171.9 172.1 172.3 172.5
lowest : 171.04 171.13 171.35 171.38 171.43, highest: 172.53 172.57 172.59 172.75 173.01
-------------------------------------------------------------------------------------------------------------------------------------------
hauteur_g
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75 .90 .95
170 0 91 1 104.1 0.3411 103.6 103.7 103.8 104.1 104.3 104.5 104.6
lowest : 103.23 103.47 103.49 103.51 103.52, highest: 104.60 104.61 104.65 104.72 104.86
-------------------------------------------------------------------------------------------------------------------------------------------
hauteur_d
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75 .90 .95
170 0 96 1 103.9 0.3749 103.4 103.5 103.7 104.0 104.2 104.3 104.4
lowest : 103.14 103.25 103.29 103.31 103.34, highest: 104.50 104.64 104.83 104.86 104.95
-------------------------------------------------------------------------------------------------------------------------------------------
marge_bas
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75 .90 .95
170 0 123 1 4.612 0.7939 3.715 3.810 4.050 4.450 5.128 5.712 5.891
lowest : 3.54 3.60 3.64 3.65 3.66, highest: 6.00 6.01 6.16 6.19 6.28
-------------------------------------------------------------------------------------------------------------------------------------------
marge_haut
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75 .90 .95
170 0 81 1 3.17 0.2655 2.800 2.890 3.012 3.170 3.330 3.470 3.591
lowest : 2.27 2.56 2.70 2.71 2.75, highest: 3.63 3.65 3.66 3.67 3.68
-------------------------------------------------------------------------------------------------------------------------------------------
longueur
n missing distinct Info Mean Gmd .05 .10 .25 .50 .75 .90 .95
170 0 129 1 112.6 1.038 110.8 111.3 111.9 112.8 113.3 113.6 113.7
lowest : 109.97 110.31 110.48 110.53 110.61, highest: 113.83 113.84 113.87 113.92 113.98
-------------------------------------------------------------------------------------------------------------------------------------------
#On a 7 variables : 1 qualitative ("authentique", qui prend la valeur "True" si le billet est vrai et "False" sinon), et 6 quantitatives, qui mesurent différents aspects du billet, avec pour unité le millimètre, et 2 décimales après la virgule.
# Les écarts-types des variables quantitatives sont très faibles (moins de 1 mm).
#Près de 59% des billets sont véritables (100 billets) contre 41% faux (70 billets).
# Violin + box + mean plots of every quantitative variable, split by the
# qualitative variable. One helper avoids repeating the same
# plot_ly()/layout() pair six times.
df <- data
# d: data frame; var: column name as a string; y_label: y-axis title.
make_violin <- function(d, var, y_label) {
  fig <- plot_ly(d,
                 x = ~authentique, y = d[[var]], split = ~authentique,
                 type = 'violin',
                 box = list(visible = T), meanline = list(visible = T))
  layout(fig,
         xaxis = list(title = "Authenticité"),
         yaxis = list(title = y_label, zeroline = F))
}
fig1 <- make_violin(df, "diagonale", "Diagonale")
fig2 <- make_violin(df, "hauteur_g", "Hauteur gauche")
fig3 <- make_violin(df, "hauteur_d", "Hauteur droite")
fig4 <- make_violin(df, "marge_bas", "Marge basse")
fig5 <- make_violin(df, "marge_haut", "Marge haute")
fig6 <- make_violin(df, "longueur", "Longueur")
fig1
fig2
fig3
fig4
fig5
fig6
# Correlogram (pairwise analysis) of the quantitative variables split by the
# qualitative variable, with overall and per-group correlation levels and
# the significance of the correlation coefficients.
p <- ggpairs(data,
columns = 1:7,
ggplot2::aes(colour=authentique),
title="Corrélogramme")  # fixed user-facing typo: "Corrélograme"
p
plot: [1,1] [=>-----------------------------------------------------------------------------------------------------------] 2% est: 0s
plot: [1,2] [===>---------------------------------------------------------------------------------------------------------] 4% est: 5s
plot: [1,3] [======>------------------------------------------------------------------------------------------------------] 6% est: 6s
plot: [1,4] [========>----------------------------------------------------------------------------------------------------] 8% est: 5s
plot: [1,5] [==========>--------------------------------------------------------------------------------------------------] 10% est: 5s
plot: [1,6] [============>------------------------------------------------------------------------------------------------] 12% est: 5s
plot: [1,7] [===============>---------------------------------------------------------------------------------------------] 14% est: 4s
plot: [2,1] [=================>-------------------------------------------------------------------------------------------] 16% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [2,2] [===================>-----------------------------------------------------------------------------------------] 18% est: 4s
plot: [2,3] [=====================>---------------------------------------------------------------------------------------] 20% est: 4s
plot: [2,4] [=======================>-------------------------------------------------------------------------------------] 22% est: 4s
plot: [2,5] [==========================>----------------------------------------------------------------------------------] 24% est: 4s
plot: [2,6] [============================>--------------------------------------------------------------------------------] 27% est: 4s
plot: [2,7] [==============================>------------------------------------------------------------------------------] 29% est: 4s
plot: [3,1] [================================>----------------------------------------------------------------------------] 31% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [3,2] [===================================>-------------------------------------------------------------------------] 33% est: 4s
plot: [3,3] [=====================================>-----------------------------------------------------------------------] 35% est: 4s
plot: [3,4] [=======================================>---------------------------------------------------------------------] 37% est: 3s
plot: [3,5] [=========================================>-------------------------------------------------------------------] 39% est: 3s
plot: [3,6] [===========================================>-----------------------------------------------------------------] 41% est: 3s
plot: [3,7] [==============================================>--------------------------------------------------------------] 43% est: 3s
plot: [4,1] [================================================>------------------------------------------------------------] 45% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [4,2] [==================================================>----------------------------------------------------------] 47% est: 3s
plot: [4,3] [====================================================>--------------------------------------------------------] 49% est: 3s
plot: [4,4] [=======================================================>-----------------------------------------------------] 51% est: 3s
plot: [4,5] [=========================================================>---------------------------------------------------] 53% est: 2s
plot: [4,6] [===========================================================>-------------------------------------------------] 55% est: 2s
plot: [4,7] [=============================================================>-----------------------------------------------] 57% est: 2s
plot: [5,1] [================================================================>--------------------------------------------] 59% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [5,2] [==================================================================>------------------------------------------] 61% est: 2s
plot: [5,3] [====================================================================>----------------------------------------] 63% est: 2s
plot: [5,4] [======================================================================>--------------------------------------] 65% est: 2s
plot: [5,5] [========================================================================>------------------------------------] 67% est: 2s
plot: [5,6] [===========================================================================>---------------------------------] 69% est: 2s
plot: [5,7] [=============================================================================>-------------------------------] 71% est: 1s
plot: [6,1] [===============================================================================>-----------------------------] 73% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [6,2] [=================================================================================>---------------------------] 76% est: 1s
plot: [6,3] [====================================================================================>------------------------] 78% est: 1s
plot: [6,4] [======================================================================================>----------------------] 80% est: 1s
plot: [6,5] [========================================================================================>--------------------] 82% est: 1s
plot: [6,6] [==========================================================================================>------------------] 84% est: 1s
plot: [6,7] [============================================================================================>----------------] 86% est: 1s
plot: [7,1] [===============================================================================================>-------------] 88% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [7,2] [=================================================================================================>-----------] 90% est: 1s
plot: [7,3] [===================================================================================================>---------] 92% est: 0s
plot: [7,4] [=====================================================================================================>-------] 94% est: 0s
plot: [7,5] [========================================================================================================>----] 96% est: 0s
plot: [7,6] [==========================================================================================================>--] 98% est: 0s
plot: [7,7] [=============================================================================================================]100% est: 0s
# Numeric columns only: drop column 1 ('authentique'). Base indexing replaces
# `select(data, subset = -c(1))` — select() has no `subset` argument.
data_num <- data[, -1]
# Correlation matrix
corr <- round(cor(data_num), 1)
# Matrix of correlation p-values
p.mat <- cor_pmat(data_num)
# Lower triangle of the correlation matrix; non-significant coefficients are crossed out
corr.plot <- ggcorrplot(
corr, hc.order = TRUE, type = "lower", outline.col = "white",
p.mat = p.mat)
#corr.plot
ggplotly(corr.plot)
Warning in L$marker$color[idx] <- aes2plotly(data, params, "fill")[idx] :
number of items to replace is not a multiple of replacement length
# Correlation matrix, upper triangle (hierarchically clustered ordering)
corrplot(corr, type="upper", order="hclust", tl.col="black", tl.srt=45)
# Compute the correlation coefficients
cor.coef <- cor(data_num)
# Compute the correlation p-values
# Pairwise cor.test() p-values for every pair of columns of x.
# Returns a symmetric matrix whose dimnames are the column names of x.
cor.test.p <- function(x){
  vars <- colnames(x)
  one_pair <- function(a, b) cor.test(x[, a], x[, b])[["p.value"]]
  pmat <- outer(vars, vars, Vectorize(one_pair))
  dimnames(pmat) <- list(vars, vars)
  pmat
}
p <- cor.test.p(data_num)
# Heatmap of the correlations; point size encodes significance
heatmaply_cor(
cor.coef,
k_col = 2,
k_row = 2,
node_type = "scatter",
point_size_mat = -log10(p),
point_size_name = "-log10(p-value)", # log scale keeps the orders of magnitude comparable and easier to interpret
label_names = c("x", "y", "Correlation")
)
# Negative correlation between length and every other variable EXCEPT the diagonal, statistically significant
# Strong positive correlation between left height and right height
rm(cor.coef, corr, corr.plot, df, fig1, fig2, fig3, fig4, fig5, fig6, p, p.mat)
rm(cor.test.p, basic_eda)
# Mean individuals: per-class means of the variables (auth: 1 = True / 0 = False).
# BUG FIX: the grouping column 'auth' exists in data_ok, not in data
# (data only has 'authentique'); `data$auth` was NULL and aggregate() failed.
indiv_moy <- aggregate(data_num, list(data_ok$auth), mean)
print(indiv_moy)
indiv_moy <- indiv_moy[, -1]  # drop the grouping column
# Centroids computed beforehand on the centred/scaled variables
centroides <- scale(data_ok[,-7], center = TRUE, scale = TRUE)
centroides <- aggregate(centroides, list(data_ok$auth), mean)
print(centroides)
rm(centroides, indiv_moy)
# PCA (principal component analysis) on the numeric variables
#data_num_quant = select(data_num, subset = -c(7))
data_num_pca = PCA(data_num, scale.unit = TRUE, ncp = 7, graph = FALSE)
data_num_pca_var = get_pca_var(data_num_pca)
data_num_pca_ind = get_pca_ind(data_num_pca)
eig_val = get_eigenvalue(data_num_pca)
# Eigenvalues / variances of the principal components. The proportion of variance explained by each eigenvalue is in the second column; the cumulative percentage is obtained by summing the successive proportions. Eigenvalues can be used to choose how many principal axes to keep after the PCA (Kaiser 1961).
eig_val
eigenvalue variance.percent cumulative.variance.percent
Dim.1 2.8468752 47.447921 47.44792
Dim.2 1.3174264 21.957106 69.40503
Dim.3 0.8540715 14.234524 83.63955
Dim.4 0.5115777 8.526295 92.16585
Dim.5 0.2767693 4.612822 96.77867
Dim.6 0.1932799 3.221331 100.00000
fviz_eig(data_num_pca, addlabels = TRUE, ylim = c(0, 70)) # Scree plot: visualise the eigenvalues
get_pca_var(data_num_pca)# Extract the PCA results for the variables
Principal Component Analysis Results for variables
===================================================
Name Description
1 "$coord" "Coordinates for the variables"
2 "$cor" "Correlations between variables and dimensions"
3 "$cos2" "Cos2 for the variables"
4 "$contrib" "contributions of the variables"
#fviz_pca_var(data_num_pca)# plot of the variable results
# cos2: quality of representation of the variables on the axes
var <- get_pca_var(data_num_pca)
corrplot(var$cos2, is.corr=TRUE)
fviz_pca_var(data_num_pca,
title='Cercle de corrélation',
col.var = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # avoid overlapping labels
)
fviz_pca_var(data_num_pca,
title='Cercle de contribution',
col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)
get_pca_ind(data_num_pca)# Extract the PCA results for the individuals
Principal Component Analysis Results for individuals
===================================================
Name Description
1 "$coord" "Coordinates for the individuals"
2 "$cos2" "Cos2 for the individuals"
3 "$contrib" "contributions of the individuals"
#fviz_pca_ind(data_num_pca)# plot of the individual results
# Individuals projected on each pair of the first three principal planes,
# coloured by authenticity. One helper avoids repeating the call three times.
# Also fixes the legend-title typo: "Authticité" -> "Authenticité".
plot_ind_plane <- function(axes_pair, plot_title) {
  fviz_pca_ind(data_num_pca,
               axes = axes_pair,
               geom.ind = "point", # show points only (no "text")
               col.ind = data$authentique, # colour by group
               palette = "Set2",
               addEllipses = TRUE,
               #ellipse.type = "confidence", # confidence ellipses; concentration ellipses if unspecified
               legend.title = "Authenticité",
               title = plot_title
  )
}
plot_ind_plane(c(1,2), "Plans 1 & 2")
plot_ind_plane(c(1,3), "Plans 1 & 3")
plot_ind_plane(c(2,3), "Plans 2 & 3")
fviz_contrib(data_num_pca, choice = "ind", addlabels = TRUE, ylim = c(0, 2), axes = 1:2)
# Rank the contribution of the individuals to the dimensions --> single out the most/least contributive banknotes
#fviz_pca_biplot(data_num_pca)# biplot of the individuals and the variables
fviz_pca_biplot (data_num_pca,
col.ind = data$authentique, palette = "jco",
addEllipses = TRUE, label = "var",
col.var = "black", repel = TRUE,
legend.title = "Authenticité",
title = "Projection des individus et des variables sur les deux premiers plans factoriels")
# cos2 (quality of representation) per axis: first for the variables, then for the individuals
fviz_cos2(data_num_pca, choice = "var", axes = 1)
fviz_cos2(data_num_pca, choice = "var", axes = 2)
fviz_cos2(data_num_pca, choice = "var", axes = 3)
fviz_cos2(data_num_pca, choice = "var", axes = 1:2)
fviz_cos2(data_num_pca, choice = "var", axes = 1:3)
fviz_cos2(data_num_pca, choice = "ind", axes = 1)
fviz_cos2(data_num_pca, choice = "ind", axes = 2)
fviz_cos2(data_num_pca, choice = "ind", axes = 3)
fviz_cos2(data_num_pca, choice = "ind", axes = 1:2)
fviz_cos2(data_num_pca, choice = "ind", axes = 1:3)
# Top 25 individuals by cos2 on the first plane, sorted descending
fviz_cos2(
data_num_pca,
choice = "ind",
axes = 1:2,
fill = "steelblue",
color = "steelblue",
sort.val = "desc",
top = 25)
# k-means with k = 2
set.seed(666)
# Scale the data (without the last column); 2 clusters; nstart = 25 means
# 25 random starts (not iterations, as the old comment said).
res.km <- kmeans(scale(data_ok[, -7]), 2, nstart = 25)
# K-means clustering: cluster assignment of each individual
res_km <- res.km$cluster
fviz_cluster(res.km, data = data_ok,
palette = "Set2",
geom = "point",
ellipse.type = "norm",
ggtheme = theme_bw(),
main = "Classification k-means" # `title` is deprecated in fviz_cluster; use `main`
)
Warning: argument title is deprecated; please use main instead.
# How did the algorithm classify the banknotes compared with the ground truth?
data_ok$res_km = res_km
data_ok$resultat = data_ok$auth - data_ok$res_km
table(data_ok$resultat) # 69 fake notes well classified, 92 genuine well classified, and only 9 false negatives! +1 false positive --> recode one of the two variables and redo the difference
-2 -1 0
69 9 92
data_kmeans <- data_ok
# Drop the helper columns (res_km, resultat) again; base indexing replaces the
# spurious `select(..., subset = ...)` form.
data_ok <- data_ok[, -c(8, 9)]
# Contingency table: rows = k-means cluster (1/2), columns = auth (0 = fake, 1 = genuine).
ctable <- table(data_kmeans$res_km, data_kmeans$auth)
# FIX: the labels were dimension-swapped — rows are the clusters, columns the true classes.
rownames(ctable) <- c("Cluster1", "Cluster2")
colnames(ctable) <- c("False", "True")
ctable
# How to visualise the two partitions in one place? --> crosstable of the two columns? fourfoldplot confusion matrix
# Recode the variables: in column auth, 1 = genuine and 0 = fake. In res_km, 1 = genuine, 2 = fake. Bring everything to 1/0.
table_test_km <- data_kmeans[, -9]
table_test_km$res_km[table_test_km$res_km>1] <- 0
# Confusion matrix via caret (NB: the variable `kmeans` shadows stats::kmeans in this session)
kmeans <- as.factor(table_test_km$res_km)
auth_kmeans <- as.factor(table_test_km$auth)
test_confu <- confusionMatrix(data=kmeans, reference = auth_kmeans)
Registered S3 methods overwritten by 'proxy':
method from
print.registry_field registry
print.registry_entry registry
test_confu # print the caret confusion matrix and its statistics
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 69 8
1 1 92
Accuracy : 0.9471
95% CI : (0.9019, 0.9755)
No Information Rate : 0.5882
P-Value [Acc > NIR] : <2e-16
Kappa : 0.8923
Mcnemar's Test P-Value : 0.0455
Sensitivity : 0.9857
Specificity : 0.9200
Pos Pred Value : 0.8961
Neg Pred Value : 0.9892
Prevalence : 0.4118
Detection Rate : 0.4059
Detection Prevalence : 0.4529
Balanced Accuracy : 0.9529
'Positive' Class : 0
#rm(ctable, table_test_km, kmeans, auth_kmeans, test_confu, res.km, res_km, data_kmeans)
data_hk = data_ok[, -7] # keep the measurement columns (drop auth)
data_scale <- scale(data_hk) # centre and reduce
hk_results = hclust(dist(data_scale)) #, method = "ward.D") # hierarchical clustering (default linkage is "complete")
clusters = cutree(hk_results, 2) # cut the tree into 2 clusters
dendr_color = fviz_dend(hk_results, k = 2,
cex = 0.4,
palette = "Set1",
rect = TRUE,
rect_fill = TRUE,
rect_border = "Set1",
labels_track_height = 0.4)
Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
plot(dendr_color) # dendrogram
# Contingency table: rows = hierarchical cluster (1/2), columns = auth (0 = fake, 1 = genuine).
ctable <- table(clusters, data_ok$auth)
# FIX: the labels were dimension-swapped — rows are the clusters, columns the true classes.
rownames(ctable) <- c("Cluster1", "Cluster2")
colnames(ctable) <- c("False", "True")
ctable
clusters Cluster1 Cluster2
False 69 24
True 1 76
# Same table without relabelling, kept as the raw confusion table
confu_hk = table(clusters, data_ok$auth)
confu_hk
clusters 0 1
1 69 24
2 1 76
# HCPC: hierarchical clustering on the principal components
#(https://www.rdocumentation.org/packages/FactoMineR/versions/2.4/topics/HCPC)
hcpc_clust = HCPC(data_num_pca, nb.clust=2, consol=TRUE, iter.max=25, min=5,
max=NULL, metric="euclidean", method="ward", graph=FALSE, proba=0.05,
cluster.CA="rows",kk=Inf,description=TRUE)
hcpc_clusters = hcpc_clust$data.clust$clust # isolate the cluster assignments
table_hcpc = table(hcpc_clusters, data_ok$auth)
table_hcpc
hcpc_clusters 0 1
1 1 92
2 69 8
# We obtain exactly the same results as with the k-means method!
data$authentique = factor(data$authentique, levels = c('True', 'False'), labels = c('vrai_billet', 'faux_billet')) # relabel for readable legends
# Build the 4 labels (cluster x true class) for each partitioning:
classement_km = factor(paste(kmeans, data$authentique, sep = ' - '))
classement_hk = factor(paste(clusters, data$authentique, sep = ' - '))
classement_hcpc = factor(paste(hcpc_clusters, data$authentique, sep = ' - '))
# One helper: individuals on the first factorial plane, coloured by the
# combined cluster/class label. The same palette is reused for the three
# methods, so the three previously copy-pasted calls collapse into one.
plot_classement <- function(labels, plot_title) {
  fviz_pca_ind(data_num_pca,
               geom=c('point'),
               pointshape = 19,
               habillage = labels,
               palette = c('#B20000', # 1 - faux billet
                           '#B26000', # 1 - vrai billet
                           '#00B2A0', # 2 - faux billet
                           '#00B233'), # 2 - vrai billet
               alpha.ind="cos2",
               ellipse.type = "norm",
               mean.point = FALSE,
               legend.title = "Légende",
               title = plot_title
  )
}
fig_km <- plot_classement(classement_km, "Visualisation k-means")
fig_hk <- plot_classement(classement_hk, "Visualisation class. hiérarchique")
fig_hcpc <- plot_classement(classement_hcpc, "Visualisation HCPC")
fig_km
fig_hk
fig_hcpc
rm(data_hk, data_kmeans, data_num_pca_ind, data_num_pca_var, data_scale, dendr_color, eig_val, fig_hcpc, fig_hk, fig_km, hcpc_clust, res.km, table_test_km, test_confu, var, auth_kmeans, classement_hcpc, classement_hk, classement_km, clusters, confu_hk, ctable, hcpc_clusters, kmeans, res_km, table_hcpc, hk_results, data_test)
# NOTE(review): `data` is removed on the next line but referenced immediately
# below via a column (`is_genuine`) that does not exist in this file's `data`
# (columns were renamed to French at the top). Presumably `data` is re-read
# from the CSV with its original column names in a chunk not shown here — confirm.
rm(data, data_num, data_num_pca, data_ok)
# Mission 3 - Model the data with a logistic regression
print(prop.table(table(data$is_genuine)))
False True
0.4117647 0.5882353
set.seed(100)
# 80/20 stratified split on the class
trainIndex <- createDataPartition(data$is_genuine,p=0.8,list=F)
print(length(trainIndex))
[1] 136
# Training set
dataTrain <- data[trainIndex,]
print(dim(dataTrain))
[1] 136 7
# Test sample, via negative indexing
dataTest <- data[-trainIndex,]
print(dim(dataTest))
[1] 34 7
#fréquences absolues des classes - éch. d'apprentissage
print(table(dataTrain$is_genuine))
False True
56 80
#fréquences relatives des classes dans l'éch. d'apprentissage
print(prop.table(table(dataTrain$is_genuine)))
False True
0.4117647 0.5882353
#fréquences absolues des classes - éch. de test
print(table(dataTest$is_genuine))
False True
14 20
#distribution des classes dans l'éch. test
print(prop.table(table(dataTest$is_genuine)))
False True
0.4117647 0.5882353
# --> The relative frequencies match the initial distribution
# Training-process parameters: keep all defaults (no resampling)
fitControl <- trainControl(method="none")
# Fit: logistic regression
m_lr <- train(is_genuine ~ ., data = dataTrain,method="glm",trControl=fitControl)
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
print(m_lr)
Generalized Linear Model
136 samples
6 predictor
2 classes: 'False', 'True'
No pre-processing
Resampling: None
# Underlying model object returned by train
# Coefficients of the logistic regression
print(m_lr$finalModel)
Call: NULL
Coefficients:
(Intercept) diagonal height_left height_right margin_low margin_up length
-2355.791 3.726 -70.844 37.852 -84.981 -87.064 51.958
Degrees of Freedom: 135 Total (i.e. Null); 129 Residual
Null Deviance: 184.3
Residual Deviance: 1.052e-08 AIC: 14
# Model AIC is 14
pred <- predict(m_lr,newdata=dataTest)
# Distribution of the predicted classes
print(table(pred))
pred
False True
13 21
# 13 fake-note predictions and 21 genuine ones, to be checked below
mat <- confusionMatrix(data=pred,reference=as.factor(dataTest$is_genuine),positive="True")
print(mat)
Confusion Matrix and Statistics
Reference
Prediction False True
False 13 0
True 1 20
Accuracy : 0.9706
95% CI : (0.8467, 0.9993)
No Information Rate : 0.5882
P-Value [Acc > NIR] : 3.624e-07
Kappa : 0.9386
Mcnemar's Test P-Value : 1
Sensitivity : 1.0000
Specificity : 0.9286
Pos Pred Value : 0.9524
Neg Pred Value : 1.0000
Prevalence : 0.5882
Detection Rate : 0.5882
Detection Prevalence : 0.6176
Balanced Accuracy : 0.9643
'Positive' Class : True
# Accuracy at 97%! The 95% confidence interval is provided, but with a small
# sample the uncertainty remains.
# One false positive is observed.
# Sensitivity for the positive class (True) is 100% (20/(20+0))!
print(mat$byClass)
Sensitivity Specificity Pos Pred Value Neg Pred Value Precision Recall
1.0000000 0.9285714 0.9523810 1.0000000 0.9523810 1.0000000
F1 Prevalence Detection Rate Detection Prevalence Balanced Accuracy
0.9756098 0.5882353 0.5882353 0.6176471 0.9642857
# Precision on the test sample: 95%!
# Scores of the positive individuals
score <- predict(m_lr,dataTest,type="prob")[,"True"]
#print(quantile(score))
# Data frame gathering the observed classes and the scores
liftdata <- data.frame(classe=as.factor(dataTest$is_genuine))
liftdata$score <- score
# lift object
lift_obj <- lift(classe ~ score, data=liftdata, class="True")
print(lift_obj) # print() only reports the proportion of positive observations (is_genuine = True).
Call:
lift.formula(x = classe ~ score, data = liftdata, class = "True")
Models: score
Event: True (58.8%)
# Display the lift curve
plot(lift_obj)
# Score
score <- predict(m_lr,dataTest,type="prob")[,"True"]
# ROC object
roc_obj <- roc(dataTest$is_genuine=="True",score)
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
# Plot the ROC object
plot(1-roc_obj$specificities, roc_obj$sensitivities, type="l")
abline(0,1)
# Maybe an issue with the true/false encoding --> try with 0/1 to check
print(roc_obj$auc)
Area under the curve: 1
# Resampling-based evaluation: 10-fold cross-validation
fitControl <- trainControl(method="cv",number=10) # 10 folds
m_lr <- train(is_genuine ~ ., data = dataTrain,method="glm",trControl=fitControl)
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
print(m_lr)
Generalized Linear Model
136 samples
6 predictor
2 classes: 'False', 'True'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 122, 123, 122, 122, 122, 122, ...
Resampling results:
Accuracy Kappa
0.978022 0.955158
print(m_lr$resample)
# Variable importance - intrinsic to the model
print(varImp(m_lr))
glm variable importance
#Variable la plus influente : longueur, puis marges, puis hauteurs, puis diagonale
# Stepwise variable selection (AIC-based) on the logistic model.
# trControl = "none": fit once on the full training set, no resampling.
m_lrs <- train(
  is_genuine ~ .,
  data = dataTrain,
  method = "glmStepAIC",
  trControl = trainControl(method = "none")
)
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Start: AIC=14
.outcome ~ diagonal + height_left + height_right + margin_low +
margin_up + length
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- diagonal 1 0.000 12.000
- height_right 1 0.000 12.000
- height_left 1 0.000 12.000
- margin_up 1 0.000 12.000
- length 1 0.000 12.000
<none> 0.000 14.000
- margin_low 1 21.779 33.779
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Step: AIC=12
.outcome ~ height_left + height_right + margin_low + margin_up +
length
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- height_right 1 0.000 10.000
- height_left 1 0.000 10.000
- margin_up 1 0.000 10.000
- length 1 0.000 10.000
<none> 0.000 12.000
- margin_low 1 30.348 40.347
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Step: AIC=10
.outcome ~ height_left + margin_low + margin_up + length
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- height_left 1 0.000 8.000
- margin_up 1 0.000 8.000
- length 1 0.000 8.000
<none> 0.000 10.000
- margin_low 1 39.357 47.357
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Step: AIC=8
.outcome ~ margin_low + margin_up + length
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Df Deviance AIC
- margin_up 1 0.000 6.000
<none> 0.000 8.000
- length 1 6.654 12.654
- margin_low 1 47.829 53.829
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Step: AIC=6
.outcome ~ margin_low + length
Df Deviance AIC
<none> 0.000 6.000
- margin_low 1 54.882 58.882
- length 1 62.316 66.316
# Final model obtained after stepwise selection (retained predictors + coefficients)
print(m_lrs$finalModel)
Call: NULL
Coefficients:
(Intercept) margin_low length
-21884.2 -318.8 207.9
Degrees of Freedom: 135 Total (i.e. Null); 133 Residual
Null Deviance: 184.3
Residual Deviance: 8.055e-08 AIC: 6
# Apply the selected model to the held-out test set and measure performance.
test_pred <- predict(m_lrs, newdata = dataTest)
test_truth <- as.factor(dataTest$is_genuine)
print(confusionMatrix(data = test_pred, reference = test_truth, positive = "True"))
Confusion Matrix and Statistics
Reference
Prediction False True
False 13 0
True 1 20
Accuracy : 0.9706
95% CI : (0.8467, 0.9993)
No Information Rate : 0.5882
P-Value [Acc > NIR] : 3.624e-07
Kappa : 0.9386
Mcnemar's Test P-Value : 1
Sensitivity : 1.0000
Specificity : 0.9286
Pos Pred Value : 0.9524
Neg Pred Value : 1.0000
Prevalence : 0.5882
Detection Rate : 0.5882
Detection Prevalence : 0.6176
Balanced Accuracy : 0.9643
'Positive' Class : True